In [ ]:
import sys
import cPickle
import numpy
import scipy
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [ ]:
# Seaborn setup so the plots in this notebook look nicer.
def init_seaborn():
    """Apply the seaborn style, palette and context used by this notebook."""
    line_overrides = {"lines.linewidth": 2.5}
    sns.set_style(style='darkgrid')
    sns.set_palette(palette='muted')
    sns.set_context(context="notebook", font_scale=1.5, rc=line_overrides)
# Fixed seed; it is passed to TSNE as random_state further down.
RS = 20151012
# Apply the seaborn settings once, before any figure is created.
init_seaborn()
In [ ]:
# read vectors, one word per row
def read_data(fn):
    """Load a pickled mapping and split it into an index lookup and a matrix.

    Parameters
    ----------
    fn : str
        Path to a pickle file holding a dict-like object (key -> vector).

    Returns
    -------
    labels : dict
        Maps each key to the row index of its vector in ``vectors``.
    vectors : numpy.ndarray
        The mapping's values passed through ``numpy.array``, in the
        mapping's iteration order.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # Binary mode is needed for non-text pickle protocols, and the ``with``
    # block guarantees the handle is closed even if loading fails.
    with open(fn, 'rb') as fh:
        d = cPickle.load(fh)
    labels, vectors = {}, []
    # ``items()`` (rather than ``iteritems()``) works on Python 2 and 3.
    for k, v in d.items():
        # Row index == number of keys seen so far, so both stay in sync.
        labels[k] = len(labels)
        vectors.append(v)
    return labels, numpy.array(vectors)
In [ ]:
labels, vectors = read_data("all_feat_vectors")
print 'nal', 'ies', scipy.spatial.distance.cosine(vectors[labels['nal']], vectors[labels['ies']])
In [ ]:
# Fit t-SNE on the vectors using cosine distance and keep the projection.
# NOTE(review): the metric is passed as a Python callable; the string
# 'cosine' may be faster, but confirm it yields the same embedding first.
tsne = TSNE(
    metric=scipy.spatial.distance.cosine,
    perplexity=20,
    learning_rate=200,
    random_state=RS,
    verbose=2,
)
proj = tsne.fit_transform(vectors)
In [ ]:
# Display the dimensions of the projected array.
proj.shape
In [ ]:
# Display the projected coordinates of the row labelled 'lly'.
proj[labels['lly']]
In [ ]:
# Scatter-plot the 2-D projection and label a hand-picked set of entries.
f = plt.figure(figsize=(12, 12))
ax = plt.subplot(aspect='equal')

xs = proj[:, 0]
ys = proj[:, 1]
sc = ax.scatter(xs, ys, lw=0, s=40)

ax.set_xlim(-25, 25)
ax.set_ylim(-25, 25)
ax.axis('off')
# NOTE(review): 'tight' is applied after the explicit limits above and may
# override them -- confirm this is intended.
ax.axis('tight')

to_annotate = [
    'ion', 'ity', 'ism', 'tor', 'age',
    'ncy', 'hip', 'ium', 'ney', 'cer',
    'ked', 'ged', 'red', 'ied', 'ced',
    'tic', 'ful', 'ary', 'cal', 'lar',
]
for label in to_annotate:
    # Place the text at the projected position of the labelled row.
    position = proj[labels[label]]
    ax.annotate(label, position, color='g', fontsize=20)

plt.show()